# List of required packages
required_packages <- c(
  "tidyverse",   # Data manipulation and visualization
  "nnet",        # Multinomial logistic regression
  "knitr",       # Table formatting
  "kableExtra",  # Enhanced table formatting
  "patchwork"    # Plot combining
)

# Install any missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages)

# Load all required packages
lapply(required_packages, library, character.only = TRUE)

# Create output directories
dir.create("outputs", showWarnings = FALSE)
dir.create("outputs/tables", showWarnings = FALSE)
dir.create("outputs/figures", showWarnings = FALSE)

# Define party order (Abstainer as reference, but will not be plotted)
party_order <- c("Abstainer/null", "Podemos", "Sumar", "PSOE", "Other", "PP", "SALF", "VOX")
plot_parties <- c("Podemos", "Sumar", "PSOE", "Other", "PP", "SALF", "VOX")  # Parties to show in plots

# Define party colors
party_colors <- c(
  "Podemos" = "#984EA3",        # Purple
  "Sumar" = "#FF69B4",          # Hot Pink
  "PSOE" = "#E41A1C",           # Red
  "VOX" = "#4DAF4A",            # Green
  "PP" = "#377EB8",             # Blue
  "Other" = "#808080",
  "SALF" = "#A6761D"            # Brown
)

# Define survey labels for more readable names
survey_labels <- c(
  "andpol" = "Original survey", 
  "cis_bjul" = "CIS July", 
  "cis_bjun" = "CIS June", 
  "db40_july" = "40dB July", 
  "cis_campaign" = "CIS EP Campaign", 
  "cis_pos" = "CIS EP Post-electoral", 
  "cis_pre" = "CIS EP Pre-electoral", 
  "db40_june" = "40dB June", 
  "gesop" = "GESOP", 
  "ees" = "EES", 
  "db40_august" = "40dB August",
  "Overall" = "Overall"
)

# Define variable groups as requested
# Substantive ideology variables
gender_sexuality_vars <- c("heteropride", "policy_antigenderbinary", "against_same_sex_marriage", "woman_traditional")
immigration_vars <- c("policy_proimmigration","against_restrictive_immigration")
economy_vars <- c("policy_proredistribution", "against_redistribution")

# Understandings of democracy
democracy_understanding_vars <- c("democracy_equality", "democracy_justice", "democracy_freedom", "democracy_welfare", 
                                  "democracy_diversity", "democracy_insecurity", "democracy_fraud", "democracy_inefficacy", 
                                  "democracy_weakness", "democracy_disorder")

# Democratic evaluations
democracy_evals_vars <- c("swd", "external_efficacy", "dem_support_lite", "dem_index")

# Catalan attitudes
catalan_attitudes_vars <- c("forbid_parties", "suspend_autonomy", "supress_school_expression",
                            "judiciary_backsliding", "media_backsliding", "monarchy_backsliding")

# Identity variables
identity_vars <- c("identity_comparative", "identity_region", "identity_spanish")

# Media variables
media_vars <- c("fake_news", "info_social_media", "tiktok_info", "telegram_info")

# International variables
international_vars <- c("pro_ue", "ue_views", "assistance_ukraine", "external_efficacy_ue")

# Combine all variable groups
all_vars <- unique(c(
  gender_sexuality_vars, immigration_vars, economy_vars, 
  democracy_understanding_vars, democracy_evals_vars, 
  catalan_attitudes_vars, identity_vars, media_vars, international_vars
))

# Create variable labels (readable names)
var_labels <- setNames(
  all_vars,  # Use variable names as default labels
  all_vars
)

# Variables to reverse-code (higher = more right-wing/conservative) for consistency
reverse_vars <- c(
  "policy_proimmigration", "policy_proredistribution", "against_restrictive_immigration",
  "democracy_equality", "democracy_justice", "democracy_freedom", "democracy_welfare", 
  "democracy_diversity", "pro_ue", "assistance_ukraine", "inequality", 
  "swd", "external_efficacy", "dem_support_lite", "dem_index"
)

# Load and preprocess data
data <- read_csv("merged_survey_data.csv") %>%
  filter(party_pref %in% party_order) %>%
  mutate(
    lr = if_else(survey == "gesop", lr * 2, lr),
    gender = if_else(gender %in% c(NA, "Other"), NA_character_, gender),
    gender = factor(gender, levels = c("Female", "Male")),
    # Create binary education variable (tertiary or less)
    edu_binary = case_when(
      edu_fct == "Tertiary studies" ~ "Tertiary",
      !is.na(edu_fct) ~ "Non-tertiary",
      TRUE ~ NA_character_
    ),
    edu_binary = factor(edu_binary, levels = c("Non-tertiary", "Tertiary")),
    # Set Abstainer as reference level
    party_pref = factor(party_pref, levels = party_order),
    # Ensure survey is a factor for models
    survey = factor(survey)
  )

#==========================================================================================
# PRE-PROCESS CATEGORICAL VARIABLES
#==========================================================================================

# Function to convert categorical variables to numeric
convert_categorical_to_numeric <- function(df) {
  # Binary variables (Yes/No) to 0/1
  binary_vars <- c("info_social_media", "tiktok_info", "telegram_info")
  
  for(var in binary_vars) {
    if(var %in% names(df) && is.factor(df[[var]])) {
      df[[var]] <- as.numeric(df[[var]] == "Yes")
      message("Converted binary variable ", var, " to numeric (0/1)")
    }
  }
  
  # Ordinal variable ue_views to numeric scale
  if("ue_views" %in% names(df) && is.factor(df[[var]])) {
    df[["ue_views"]] <- case_when(
      df[["ue_views"]] == "Bad" ~ 0,
      df[["ue_views"]] == "Neither" ~ 0.5,
      df[["ue_views"]] == "Good" ~ 1,
      TRUE ~ NA_real_
    )
    message("Converted ordinal variable ue_views to numeric scale (0-1)")
  }
  
  return(df)
}

# Convert character variables to factors first
handle_character_vars <- function(df) {
  # Character variables that should be factors
  char_vars <- c("sex_or", "info_social_media", "tiktok_info", "telegram_info", "ue_views")
  
  for(var in char_vars) {
    if(var %in% names(df) && is.character(df[[var]])) {
      df[[var]] <- factor(df[[var]])
      message("Converted ", var, " to factor")
    }
  }
  
  return(df)
}

# Apply character variable conversion, then numeric conversion
data <- handle_character_vars(data)
data <- convert_categorical_to_numeric(data)

#==========================================================================================
# FUNCTION TO SCALE VARIABLES TO 0-1 RANGE
#==========================================================================================

scale_variables <- function(df, variables) {
  scaled_df <- df
  
  for(var in variables) {
    # Check if variable exists in the dataset
    if(var %in% names(df)) {
      # Check if variable is numeric/integer
      if(is.numeric(df[[var]])) {
        # Calculate min and max, excluding missing values
        var_min <- min(df[[var]], na.rm = TRUE)
        var_max <- max(df[[var]], na.rm = TRUE)
        
        # Check if min and max are different to avoid division by zero
        if(var_min != var_max) {
          # Scale to 0-1 range
          scaled_df[[var]] <- (df[[var]] - var_min) / (var_max - var_min)
          message("Scaled ", var, " to 0-1 range")
          
          # Reverse code if necessary
          if(var %in% reverse_vars) {
            scaled_df[[var]] <- 1 - scaled_df[[var]]
            message("  - Reverse-coded for consistent interpretation")
          }
        } else {
          message("Warning: Variable ", var, " has constant value. Not scaling.")
        }
      } else {
        message("Warning: Variable ", var, " is not numeric. Cannot scale.")
      }
    } else {
      message("Warning: Variable ", var, " not found in dataset.")
    }
  }
  
  return(scaled_df)
}

#==========================================================================================
# IDENTIFY AVAILABLE VARIABLES BY SURVEY
#==========================================================================================

check_variable_availability <- function(df, variables, min_cases = 100) {
  availability <- map_dfr(variables, function(var) {
    # Check if variable exists in the dataset
    if(!(var %in% names(df))) {
      return(tibble(
        variable = var,
        survey = NA_character_,
        n_cases = 0,
        available = FALSE
      ))
    }
    
    map_dfr(unique(df$survey), function(survey_name) {
      # Count non-missing cases per survey for this variable
      n_cases <- df %>% 
        filter(survey == survey_name) %>%
        filter(!is.na(!!sym(var))) %>%
        nrow()
      
      tibble(
        variable = var,
        survey = survey_name,
        n_cases = n_cases,
        available = n_cases >= min_cases
      )
    })
  })
  
  return(availability)
}

#==========================================================================================
# FUNCTION TO EXTRACT MODEL FIT STATISTICS
#==========================================================================================

get_model_fit <- function(model, data) {
  # Log-likelihood
  ll <- logLik(model)[1]
  
  # AIC
  aic_value <- AIC(model)
  
  # BIC
  bic_value <- BIC(model)
  
  # Calculate McFadden's R²
  # First, the null model (intercept only)
  null_model <- tryCatch({
    multinom(party_pref ~ 1, data = data, trace = FALSE)
  }, error = function(e) {
    return(NULL)
  })
  
  mcfadden_r2 <- if(!is.null(null_model)) {
    ll_null <- logLik(null_model)[1]
    1 - (ll / ll_null)
  } else {
    NA
  }
  
  # Number of observations
  n_obs <- nrow(data)
  
  # Degrees of freedom
  df <- attr(logLik(model), "df")
  
  # Return as a named list
  list(
    logLik = ll,
    AIC = aic_value,
    BIC = bic_value,
    McFadden_R2 = mcfadden_r2,
    n_obs = n_obs,
    df = df
  )
}

#==========================================================================================
# FUNCTION TO RUN MULTINOMIAL MODELS FOR A SINGLE VARIABLE
#==========================================================================================

run_variable_multinom <- function(var_name, survey_name = NULL) {
  # Filter data based on survey (if specified) or use all data
  if(!is.null(survey_name)) {
    model_data <- data %>% 
      filter(survey == survey_name) %>%
      drop_na(!!sym(var_name), party_pref, gender, age, edu_binary)
    survey_label <- survey_name
    include_survey <- FALSE
  } else {
    # Check which surveys have this variable
    available_surveys <- variable_availability %>%
      filter(variable == var_name, available) %>%
      pull(survey)
    
    if(length(available_surveys) == 0) {
      message("No survey has sufficient data for variable: ", var_name)
      return(NULL)
    }
    
    model_data <- data %>% 
      filter(survey %in% available_surveys) %>%
      drop_na(!!sym(var_name), party_pref, gender, age, edu_binary)
    
    # If only one survey has data for this variable, use the survey name instead of "Overall"
    if(length(available_surveys) == 1) {
      survey_label <- available_surveys[1]
    } else {
      survey_label <- "Overall"
    }
    
    include_survey <- length(available_surveys) > 1
  }
  
  # Check if we have enough data
  if(nrow(model_data) < 100) {
    message("Insufficient data for variable ", var_name, " in ", 
            ifelse(is.null(survey_name), "all surveys", survey_name))
    return(NULL)
  }
  
  # Create formula - use survey as control if using combined data
  if(include_survey && length(unique(model_data$survey)) > 1) {
    formula <- as.formula(paste("party_pref ~", var_name, "+ gender + age + edu_binary + survey"))
    message("Running model for ", var_name, " with survey as control (", 
            paste(unique(model_data$survey), collapse=", "), ")")
  } else {
    formula <- as.formula(paste("party_pref ~", var_name, "+ gender + age + edu_binary"))
    message("Running model for ", var_name, " in ", 
            ifelse(survey_label == "Overall", "combined data", survey_label))
  }
  
  # Fit multinomial model
  model <- tryCatch({
    multinom(formula, data = model_data, trace = FALSE, maxit = 500)
  }, error = function(e) {
    message("Error in model for variable ", var_name, ": ", e$message)
    return(NULL)
  })
  
  if(is.null(model)) return(NULL)
  
  # Extract coefficients and standard errors for log-odds
  coefs <- summary(model)$coefficients
  ses <- summary(model)$standard.errors
  z_values <- coefs / ses
  p_values <- 2 * (1 - pnorm(abs(z_values)))
  
  # Create a dataframe with log-odds for the specific variable
  log_odds <- data.frame()
  for(i in 1:nrow(coefs)) {
    party <- rownames(coefs)[i]
    # Get the column index for our variable
    var_col <- grep(paste0("^", var_name, "$"), colnames(coefs))
    
    if(length(var_col) > 0) {
      estimate <- coefs[i, var_col]
      std.error <- ses[i, var_col]
      p.value <- p_values[i, var_col]
      
      # Add significance stars
      stars <- case_when(
        p.value < 0.001 ~ "***",
        p.value < 0.01 ~ "**",
        p.value < 0.05 ~ "*",
        p.value < 0.1 ~ ".",
        TRUE ~ ""
      )
      
      log_odds <- rbind(log_odds, data.frame(
        party = party,
        estimate = estimate,
        std.error = std.error,
        stars = stars,
        stringsAsFactors = FALSE
      ))
    }
  }
  
  # Calculate model fit statistics
  model_fit <- get_model_fit(model, model_data)
  
  return(list(
    variable = var_name,
    survey = survey_label,
    log_odds = log_odds,
    n = nrow(model_data),
    model_fit = model_fit,
    surveys_used = if(include_survey) unique(model_data$survey) else survey_label,
    is_combined = include_survey && length(unique(model_data$survey)) > 1
  ))
}

#==========================================================================================
# FUNCTION TO RUN ANALYSIS FOR A GROUP OF VARIABLES
#==========================================================================================

run_group_analysis <- function(var_group, group_name, target_surveys = NULL) {
  message("\n=== Analyzing variable group: ", group_name, " ===")
  
  # Run models for each variable in the group
  overall_models <- map(var_group, ~run_variable_multinom(.x))
  names(overall_models) <- var_group
  
  # Filter out NULL results
  valid_overall_models <- compact(overall_models)
  
  # If target surveys specified, run individual survey models
  individual_models <- list()
  if(!is.null(target_surveys) && length(target_surveys) > 0) {
    message("\n--- Running individual survey models for ", group_name, " variables ---")
    
    individual_models <- map(var_group, function(var_name) {
      survey_models <- map(target_surveys, function(survey) {
        result <- run_variable_multinom(var_name, survey)
        if(is.null(result)) return(NULL)
        return(result)
      })
      names(survey_models) <- target_surveys
      compact(survey_models)
    })
    names(individual_models) <- var_group
    
    individual_models <- compact(individual_models)
  }
  
  return(list(
    group_name = group_name,
    overall_models = valid_overall_models,
    individual_models = individual_models
  ))
}

#==========================================================================================
# FUNCTION TO CREATE VISUALIZATIONS FOR VARIABLE GROUP (LOG-ODDS ONLY)
#==========================================================================================

# Updated create_group_plots function with all requested improvements

# Create a mapping of variable names to more meaningful, short names
var_readable_names <- c(
  # Gender and Sexuality
  "heteropride" = "Pro-Heterosexual Pride Day",
  "policy_antigenderbinary" = "Anti-Non-Binary Identity",
  "against_same_sex_marriage" = "Anti-Same Sex Marriage",
  "woman_traditional" = "Traditional Women's Role",
  
  # Immigration
  "policy_proimmigration" = "Immigration as Culturally Good (reversed)",
  "against_restrictive_immigration" = "Pro-Immigration Policy (reversed)",
  
  # Economy
  "policy_proredistribution" = "Pro-Government Redistribution (reversed)",
  "against_redistribution" = "Anti-Redistribution Preferences",
  
  # Democracy Understanding
  "democracy_equality" = "Democracy = Equality (reversed)",
  "democracy_justice" = "Democracy = Justice (reversed)",
  "democracy_freedom" = "Democracy = Freedom (reversed)",
  "democracy_welfare" = "Democracy = Welfare (reversed)",
  "democracy_diversity" = "Democracy = Diversity (reversed)",
  "democracy_insecurity" = "Democracy = Insecurity",
  "democracy_fraud" = "Democracy = Fraud",
  "democracy_inefficacy" = "Democracy = Inefficacy",
  "democracy_weakness" = "Democracy = Weakness",
  "democracy_disorder" = "Democracy = Disorder",
  
  # Democracy Evaluations
  "swd" = "Satisfaction with Democracy (reversed)",
  "external_efficacy" = "External Efficacy (reversed)",
  "dem_support_lite" = "Democracy Support (reversed)",
  "dem_index" = "Democratic Meaning Index (reversed)",
  
  # Catalan Attitudes
  "forbid_parties" = "Forbid Pro-Independence Parties",
  "suspend_autonomy" = "Suspend Catalan Autonomy",
  "supress_school_expression" = "Suppress School Expression",
  "judiciary_backsliding" = "Judicial Backsliding",
  "media_backsliding" = "Media Backsliding",
  "monarchy_backsliding" = "Monarchy Backsliding",
  
  # Identity
  "identity_comparative" = "Regional vs Spanish Identity",
  "identity_region" = "Regional Identity",
  "identity_spanish" = "Spanish Identity",
  
  # Media
  "fake_news_right" = "Fake News (Right)",
  "fake_news_left" = "Fake News (Left)",
  "fake_news" = "Fake News Perceptions",
  "info_social_media" = "Social Media Info",
  "tiktok_info" = "TikTok Info",
  "telegram_info" = "Telegram Info",
  
  # International
  "pro_ue" = "Support for EU integration (reversed)",
  "ue_views" = "EU Views",
  "assistance_ukraine" = "Help Ukraine (reversed)",
  "external_efficacy_ue" = "External Efficacy EU"
)

create_group_plots <- function(group_results, min_err_bar = 0.05, jitter_height = 0.2) {
  group_name <- group_results$group_name
  overall_models <- group_results$overall_models
  individual_models <- group_results$individual_models
  
  # Extract log-odds from overall models
  log_odds_data <- map_dfr(overall_models, function(model) {
    if(is.null(model$log_odds)) return(NULL)
    
    model$log_odds %>%
      filter(party %in% plot_parties) %>%
      mutate(
        variable = model$variable,
        survey = model$survey,  # Now can be specific survey name if only one data source
        sample_size = model$n,
        is_combined = model$is_combined  # Flag to indicate if this is a combined model (multiple surveys)
      )
  })
  
  # Extract data from individual survey models if available
  if(length(individual_models) > 0) {
    individual_log_odds_data <- map_dfr(individual_models, function(var_models) {
      map_dfr(var_models, function(model) {
        if(is.null(model$log_odds)) return(NULL)
        
        model$log_odds %>%
          filter(party %in% plot_parties) %>%
          mutate(
            variable = model$variable,
            survey = model$survey,
            sample_size = model$n,
            is_combined = FALSE  # Individual survey models are never combined
          )
      })
    })
    
    # Add individual models to the log_odds_data
    log_odds_data <- bind_rows(log_odds_data, individual_log_odds_data)
  }
  
  # Check if we have data to plot
  if(nrow(log_odds_data) == 0) {
    message("No data available for plotting ", group_name)
    return(NULL)
  }
  
  # Ensure party factor levels are maintained
  log_odds_data$party <- factor(log_odds_data$party, levels = plot_parties)
  
  # Define updated survey shapes and labels
  survey_shapes <- c(
    "andpol" = 1,           # Hollow circle
    "cis_bjul" = 2,         # Hollow triangle
    "cis_bjun" = 0,         # Hollow square
    "db40_july" = 5,        # Hollow diamond
    "cis_campaign" = 6,     # Hollow upside-down triangle
    "cis_pos" = 3,          # Hollow plus
    "cis_pre" = 4,          # Hollow cross
    "db40_june" = 7,        # Hollow square with cross
    "gesop" = 8,            # Hollow diamond with cross
    "ees" = 9,              # Hollow dot with center
    "db40_august" = 10,     # Hollow three-pointed star
    "Overall" = 16          # Filled circle (for combined surveys only)
  )
  
  # Check for outliers in log-odds data
  outliers <- log_odds_data %>%
    group_by(variable, party) %>%
    mutate(
      median_est = median(estimate, na.rm = TRUE),
      mad = mad(estimate, na.rm = TRUE),
      is_extreme = abs(estimate - median_est) > 5 * mad
    ) %>%
    filter(is_extreme)
  
  if(nrow(outliers) > 0) {
    message("Identified ", nrow(outliers), " extreme outliers in log-odds data")
    print(outliers %>% select(variable, survey, party, estimate))
    
    # Filter out extreme outliers from plot data
    log_odds_data_clean <- log_odds_data %>%
      anti_join(outliers %>% select(variable, survey, party), by = c("variable", "survey", "party"))
  } else {
    log_odds_data_clean <- log_odds_data
  }
  
  # Function to create a plot for a single variable
  create_variable_plot <- function(data, var_name, y_var, y_label, err_var = "std.error") {
    # Get a readable name for this variable
    readable_name <- var_readable_names[var_name]
    if(is.na(readable_name)) readable_name <- var_name  # Fallback to original name if not in mapping
    
    var_data <- data %>% 
      filter(variable == var_name) %>%
      mutate(
        lower = !!sym(y_var) - pmax(!!sym(err_var), min_err_bar),
        upper = !!sym(y_var) + pmax(!!sym(err_var), min_err_bar),
        # Use survey_labels to convert survey codes to readable labels
        survey_readable = ifelse(survey %in% names(survey_labels), 
                                 survey_labels[survey], 
                                 as.character(survey))
      )
    
    if(nrow(var_data) == 0) {
      return(NULL)
    }
    
    # Get unique surveys in this plot
    unique_surveys <- unique(var_data$survey)
    
    # Check if both cis_bjun and cis_bjul are present (and no other surveys except possibly "Overall")
    has_both_cis <- all(c("cis_bjun", "cis_bjul") %in% unique_surveys) && 
      all(unique_surveys %in% c("cis_bjun", "cis_bjul", "Overall"))
    
    # Separate Overall data from individual survey data
    overall_data <- var_data %>% filter(survey == "Overall")
    survey_data <- var_data %>% filter(survey != "Overall")
    
    # For stronger jittering to prevent overlaps, set a seed for consistent jitter
    set.seed(123 + which(unique(log_odds_data$variable) == var_name))
    
    # Prepare jitter position for consistency between points and labels
    jitter_pos <- position_jitter(width = 0, height = 0.2)
    
    # Get a list of all surveys in this plot for the shape scale
    plot_surveys <- unique(var_data$survey)
    plot_shapes <- survey_shapes[names(survey_shapes) %in% plot_surveys]
    
    # Create the plot with parties on y-axis
    p <- ggplot() +
      # Add reference line at 0
      geom_vline(xintercept = 0, linetype = "dotted", color = "gray50", alpha = 0.5)
    
    # Add error bars for Overall data
    if(nrow(overall_data) > 0) {
      p <- p + geom_errorbarh(
        data = overall_data,
        aes(y = party, xmin = lower, xmax = upper, color = party),
        height = 0.2
      )
    }
    
    # Add individual survey data
    if(nrow(survey_data) > 0) {
      # For cis_bjun and cis_bjul when they appear together, don't show error bars
      if(has_both_cis) {
        # Add points only without error bars
        p <- p + geom_point(
          data = survey_data, 
          aes(y = party, x = !!sym(y_var), color = party, 
              size = sample_size, shape = survey),
          position = jitter_pos,
          alpha = 0.5,
          stroke = 1
        )
      } else {
        # Add error bars for individual surveys
        p <- p + geom_errorbarh(
          data = survey_data,
          aes(y = party, xmin = lower, xmax = upper, color = party),
          height = 0.2,
          alpha = 0.5
        )
        
        # Add points with error bars
        p <- p + geom_point(
          data = survey_data, 
          aes(y = party, x = !!sym(y_var), color = party, 
              size = sample_size, shape = survey),
          alpha = 0.7,
          stroke = 1
        )
      }
    }
    
    # Add Overall points
    if(nrow(overall_data) > 0) {
      p <- p + geom_point(
        data = overall_data, 
        aes(y = party, x = !!sym(y_var), color = party, size = sample_size),
        shape = 16,  # Filled circle for overall
        alpha = 0.7
      )
    }
    
    # Add value labels for Overall and all survey estimates
    p <- p + geom_text(
      data = var_data, 
      aes(y = party, x = !!sym(y_var), label = sprintf("%.2f", !!sym(y_var))),
      color = "black", 
      vjust = -0.7, 
      hjust = 0.5, 
      size = 3.5,
      position = if(has_both_cis && nrow(survey_data) > 0) jitter_pos else position_identity()
    )
    
    # Styling
    p <- p + 
      scale_color_manual(values = party_colors, guide = "none") +
      scale_size_continuous(name = "Sample size (N)", range = c(2, 8)) +
      theme_minimal() +
      theme(
        panel.grid.minor = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "bottom"
      ) +
      labs(
        x = y_label,
        y = "party",
        title = readable_name,
        caption = if(nrow(outliers) > 0) "Note: Extreme outliers removed for better visualization" else ""
      )
    
    # Add shape scale with survey_labels for the legend
    if(length(plot_shapes) > 0) {
      survey_names <- names(plot_shapes)
      survey_readable_names <- ifelse(survey_names %in% names(survey_labels), 
                                      survey_labels[survey_names], 
                                      survey_names)
      
      p <- p + scale_shape_manual(
        name = "Survey", 
        values = plot_shapes,
        labels = survey_readable_names,
        drop = FALSE  # This prevents dropping unused levels from the legend
      )
      
      # Add a guides() call to ensure the legend appears
      p <- p + guides(shape = guide_legend(override.aes = list(size = 3)))
    }
    
    return(p)
  }
  
  # Function to create a special social media plot
  create_social_media_plot <- function(data) {
    # Get data for info_social_media
    var_name <- "info_social_media"
    readable_name <- var_readable_names[var_name]
    
    var_data <- data %>% 
      filter(variable == var_name) %>%
      mutate(
        lower = estimate - pmax(std.error, min_err_bar),
        upper = estimate + pmax(std.error, min_err_bar)
      )
    
    if(nrow(var_data) == 0) {
      return(NULL)
    }
    
    # Get unique surveys for this variable
    message("Social Media Info surveys: ", paste(unique(var_data$survey), collapse=", "))
    
    # Create the base plot
    p <- ggplot() +
      # Add reference line at 0
      geom_vline(xintercept = 0, linetype = "dotted", color = "gray50", alpha = 0.5) +
      # Add error bars
      geom_errorbarh(
        data = var_data,
        aes(y = party, xmin = lower, xmax = upper, color = party),
        height = 0.2,
        alpha = 0.7
      ) +
      # Add points
      geom_point(
        data = var_data,
        aes(y = party, x = estimate, color = party, size = sample_size),
        alpha = 0.8
      ) +
      # Add value labels
      geom_text(
        data = var_data,
        aes(y = party, x = estimate, label = sprintf("%.2f", estimate)),
        color = "black",
        vjust = -0.7,
        hjust = 0.5,
        size = 3.5
      ) +
      # Styling
      scale_color_manual(values = party_colors, guide = "none") +
      scale_size_continuous(name = "Sample size (N)", range = c(2, 8)) +
      theme_minimal() +
      theme(
        panel.grid.minor = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "bottom"
      ) +
      labs(
        x = "Log-Odds Coefficient",
        y = "party",
        title = readable_name,
        caption = if(nrow(outliers) > 0) "Note: Extreme outliers removed for better visualization" else ""
      )
    
    # Create dummy data for the legend
    dummy_data <- data.frame(
      x = rep(min(var_data$estimate), 2),
      y = rep(levels(var_data$party)[1], 2),
      survey = c("cis_campaign", "cis_pos")
    )
    
    # Add fake layer for legend (won't be visible but will generate the legend)
    p <- p + 
      geom_point(
        data = dummy_data,
        aes(x = x, y = y, shape = survey),
        alpha = 0,  # Make them invisible
        show.legend = TRUE
      ) +
      scale_shape_manual(
        name = "Survey",
        values = c("cis_campaign" = 6, "cis_pos" = 3),
        labels = c("cis_campaign" = "CIS EP Campaign", "cis_pos" = "CIS EP Post-electoral"),
        drop = FALSE
      ) +
      guides(shape = guide_legend(override.aes = list(alpha = 1, size = 3, color = "black")))
    
    return(p)
  }
  
  # Create plots for each variable in the group
  var_names <- unique(log_odds_data$variable)
  
  logodds_plots <- map(var_names, function(var_name) {
    if(var_name == "info_social_media") {
      # Use the special social media plotting function
      return(create_social_media_plot(log_odds_data_clean))
    } else {
      # Use the standard plotting function for other variables
      return(create_variable_plot(log_odds_data_clean, var_name, "estimate", "Log-Odds Coefficient"))
    }
  })
  names(logodds_plots) <- var_names
  logodds_plots <- compact(logodds_plots)
  
  # Combine log-odds plots into a grid - no titles or subtitles
  if(length(logodds_plots) > 0) {
    combined_logodds_plot <- wrap_plots(logodds_plots, ncol = 2) +
      plot_annotation(
        caption = if(nrow(outliers) > 0) "Note: Extreme outliers removed for better visualization" else ""
      )
    
    ggsave(
      file.path("outputs/figures", paste0("figure_", tolower(gsub(" ", "_", group_name)), "_logodds.png")), 
      combined_logodds_plot, width = 12, height = 4 * ceiling(length(logodds_plots) / 2), bg = "white"
    )
  }
  
  message("Created plots for ", group_name, " variables")
  return(NULL)
}


#==========================================================================================
# FUNCTION TO CREATE TABLES FOR VARIABLE GROUP
#==========================================================================================

create_group_tables <- function(group_results) {
  group_name <- group_results$group_name
  overall_models <- group_results$overall_models
  
  # Extract log-odds from overall models
  logodds_table_data <- map_dfr(overall_models, function(model) {
    if(is.null(model$log_odds)) return(NULL)
    
    # Check if log_odds has the necessary columns
    if(!all(c("party", "estimate", "std.error", "stars") %in% names(model$log_odds))) {
      message("Missing required columns in log_odds data for variable ", model$variable)
      return(NULL)
    }
    
    model$log_odds %>%
      filter(party %in% plot_parties) %>%
      mutate(
        variable = model$variable,
        # Format log-odds with stars and standard errors
        formatted_logodds = sprintf("%.3f%s\n(%.3f)", estimate, stars, std.error)
      ) %>%
      select(variable, party, formatted_logodds)
  })
  
  # If we have no valid table data, return early
  if(nrow(logodds_table_data) == 0) {
    message("No valid log-odds data for table in group: ", group_name)
    return(NULL)
  }
  
  # Reshape to wide format for log-odds
  logodds_table_wide <- logodds_table_data %>%
    pivot_wider(
      id_cols = variable,
      names_from = party,
      values_from = formatted_logodds
    ) %>%
    # Add information about model sources and sample sizes
    mutate(
      survey_source = map_chr(overall_models, function(model) {
        if(is.null(model)) return(NA_character_)
        
        # Check if surveys_used is a character vector or a single string
        surveys_used <- if(is.character(model$surveys_used) && length(model$surveys_used) == 1) {
          model$surveys_used
        } else {
          paste(model$surveys_used, collapse = ", ")
        }
        
        # Replace survey codes with survey_labels
        for(survey_code in names(survey_labels)) {
          surveys_used <- gsub(
            paste0("\\b", survey_code, "\\b"), 
            survey_labels[survey_code], 
            surveys_used
          )
        }
        
        return(surveys_used)
      })[variable],
      n_cases = map_dbl(overall_models, ~if(is.null(.x)) NA_integer_ else .x$n)[variable]
    )
  
  # Format as LaTeX table for log-odds
  logodds_latex_table <- kable(logodds_table_wide, format = "latex", booktabs = TRUE, 
                               caption = paste0("Table: Log-Odds Coefficients of ", group_name, " Variables on Party Preference"),
                               label = paste0("tab:logodds_", tolower(gsub(" ", "_", group_name))),
                               col.names = c("Variable", plot_parties, "Survey Source", "N"),
                               escape = FALSE) %>%
    kable_styling(latex_options = c("striped", "scale_down", "hold_position")) %>%
    add_header_above(c(" " = 1, "Party (Log-Odds Coefficient)" = length(plot_parties), " " = 2)) %>%
    footnote(
      general = "Cell entries show log-odds coefficients with standard errors in parentheses. Reference category is Abstainer/null.",
      symbol = c("p < 0.1", "p < 0.05", "p < 0.01", "p < 0.001"),
      symbol_manual = c(".", "*", "**", "***"),
      threeparttable = TRUE,
      escape = FALSE
    )
  
  # Create model fit statistics table
  model_fit_table <- map_dfr(overall_models, function(model) {
    if(is.null(model) || is.null(model$model_fit)) return(NULL)
    
    # Convert survey codes to survey labels
    survey_text <- if(is.character(model$surveys_used) && length(model$surveys_used) == 1) {
      model$surveys_used
    } else {
      paste(model$surveys_used, collapse = ", ")
    }
    
    # Replace survey codes with survey_labels
    for(survey_code in names(survey_labels)) {
      survey_text <- gsub(
        paste0("\\b", survey_code, "\\b"), 
        survey_labels[survey_code], 
        survey_text
      )
    }
    
    tibble(
      Variable = model$variable,
      Survey = survey_text,
      N = model$n,
      LL = sprintf("%.2f", model$model_fit$logLik),
      AIC = sprintf("%.2f", model$model_fit$AIC),
      BIC = sprintf("%.2f", model$model_fit$BIC),
      McFadden_R2 = sprintf("%.3f", model$model_fit$McFadden_R2),
      df = model$model_fit$df
    )
  })
  
  # Format as LaTeX table for model fit
  model_fit_latex_table <- kable(model_fit_table, format = "latex", booktabs = TRUE,
                                 caption = paste0("Table: Model Fit Statistics for ", group_name, " Variables"),
                                 label = paste0("tab:fit_", tolower(gsub(" ", "_", group_name))),
                                 col.names = c("Variable", "Survey", "N", "Log-Likelihood", "AIC", "BIC", "McFadden R²", "df"),
                                 align = c("l", "l", "r", "r", "r", "r", "r", "r"),
                                 escape = FALSE) %>%
    kable_styling(latex_options = c("striped", "hold_position")) %>%
    footnote(
      general = "All models control for gender, age, and education. Models using data from multiple surveys also control for survey effects.",
      threeparttable = TRUE
    )
  
  # Write tables to files
  dir_path <- "outputs/tables"
  
  writeLines(logodds_latex_table, 
             file.path(dir_path, paste0("table_", tolower(gsub(" ", "_", group_name)), "_logodds.tex")))
  
  writeLines(model_fit_latex_table, 
             file.path(dir_path, paste0("table_", tolower(gsub(" ", "_", group_name)), "_fit.tex")))
  
  message("Created tables for ", group_name, " variables")
}

#==========================================================================================
# MAIN EXECUTION
#==========================================================================================

# Scale all variables to 0-1 range
message("\n=== Scaling all variables to 0-1 range ===")
data <- scale_variables(data, all_vars)

# Check variable availability by survey
message("\n=== Checking variable availability by survey ===")
variable_availability <- check_variable_availability(data, all_vars)

# Print summary of variable availability
availability_summary <- variable_availability %>%
  filter(available) %>%
  group_by(variable) %>%
  summarise(
    surveys = paste(survey, collapse = ", "),
    total_cases = sum(n_cases),
    .groups = "drop"
  ) %>%
  arrange(desc(total_cases))

message("\n=== Variable Availability Summary ===")
print(availability_summary)

# Define target surveys for individual analysis
target_surveys <- c("cis_bjun", "cis_bjul")

# Run analysis for each variable group
message("\n=== Running multinomial regressions for each variable group ===")

# List of variable groups to analyze with nice names
group_list <- list(
  "Gender and Sexuality" = gender_sexuality_vars,
  "Immigration" = immigration_vars,
  "Economy" = economy_vars,
  "Democracy Understanding" = democracy_understanding_vars,
  "Democracy Evaluations" = democracy_evals_vars,
  "Catalan Attitudes" = catalan_attitudes_vars,
  "Identity" = identity_vars,
  "Media" = media_vars,
  "International" = international_vars
)

# Run analyses for each group
group_results <- map2(group_list, names(group_list), 
                      ~run_group_analysis(.x, .y, target_surveys))
names(group_results) <- names(group_list)

# Create plots and tables for each group
message("\n=== Creating visualizations and tables ===")
walk(group_results, ~create_group_plots(.x))
walk(group_results, ~create_group_tables(.x))

message("Analysis completed. Output files saved.")